imdb_ratings explicit from IMDb (for obvious
reasons)
dim(simpsons)
## [1] 725 12
# List the column names of simpsons.
names(simpsons)
## [1] "id" "title" "description"
## [4] "original_air_date" "production_code" "directed_by"
## [7] "written_by" "season" "number_in_season"
## [10] "number_in_series" "us_viewers_in_millions" "imdb_rating"
# Per-column summary of simpsons: quantiles and mean for numeric and date
# columns, length/class/mode for character columns.
summary(simpsons)
## id title description original_air_date
## Min. : 0 Length:725 Length:725 Min. :1989-12-17
## 1st Qu.:181 Class :character Class :character 1st Qu.:1997-10-26
## Median :362 Mode :character Mode :character Median :2005-11-27
## Mean :362 Mean :2006-01-04
## 3rd Qu.:543 3rd Qu.:2014-03-16
## Max. :724 Max. :2022-05-22
## production_code directed_by written_by season
## Length:725 Length:725 Length:725 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 9.00
## Mode :character Mode :character Mode :character Median :17.00
## Mean :16.94
## 3rd Qu.:25.00
## Max. :33.00
## number_in_season number_in_series us_viewers_in_millions imdb_rating
## Min. : 1.00 Min. : 1 Length:725 Min. :4.000
## 1st Qu.: 6.00 1st Qu.: 182 Class :character 1st Qu.:6.600
## Median : 12.00 Median : 363 Mode :character Median :7.100
## Mean : 15.98 Mean : 3123 Mean :7.166
## 3rd Qu.: 17.00 3rd Qu.: 544 3rd Qu.:7.700
## Max. :1920.00 Max. :712713 Max. :9.300
# Count the NA cells across every column of simpsons.
sum(is.na(simpsons))
## [1] 0
# Preview the first rows as a paged table, then summarise the
# length_description column.
rmarkdown::paged_table(head(simpsons))
summary(simpsons[["length_description"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 18.00 23.00 23.92 28.00 98.00
imdb_rating to rating
us_viewers_in_millions to viewers
length_description and age (Time since
original air date)
age analysis is conducted will be
based off this date as newer seasons start
around this time
simpsons$description[which(simpsons$length_description == 5)]
## [1] "Marge becomes a real-estate agent."
## [2] "Marge accidentally gets breast implants."
## [3] "Fat Tony becomes Maggie's godfather."
Initial observations off of summary:
Potential errors in data based off summary:
Error in number_in_series (Max 712713)
Error in number_in_season (Max 1920)
No blatant missing values, but some numeric values are stored as a different type
(us_viewers_in_millions is char type)
Refer to README for other preliminary questions
# Page through the rows with the largest number_in_series values.
simpsons %>%
  arrange(desc(number_in_series)) %>%
  head() %>%
  rmarkdown::paged_table()
#DT::datatable(head(arrange(simpsons, desc(number_in_series))))
# Columns carried into the simpsons_sub subset.
subsetDF <- c(
  "id", "title", "production_code", "directors", "writers",
  "number_in_season", "number_in_series"
)
simpsons_sub <- simpsons[, subsetDF]
Sorting by number_in_series reveals there are
cases where records were wrong
# Print the column names stored in subsetDF.
subsetDF
## [1] "id" "title" "production_code" "directors"
## [5] "writers" "number_in_season" "number_in_series"
#simpsons[which.max(simpsons$number_in_season),]
#head(arrange(simpsons, desc(number_in_season)))
# Page through the episodes with the highest number_in_season values.
simpsons_sub[c("id", "title", "number_in_season")] %>%
  arrange(desc(number_in_season)) %>%
  head() %>%
  rmarkdown::paged_table()
num_in_series
number_in_season is 1213
number_in_series use
id, however number_in_season adjusted
accordingly as two episodes
simpsons$viewers <- as.numeric(simpsons$us_viewers_in_millions)
## Warning: NAs introduced by coercion
# Locate the rows where viewers is NA after the numeric conversion.
# viewers is already numeric, so no second as.numeric() is needed.
na_viewer_rows <- which(is.na(simpsons$viewers))
na_viewer_rows
## [1] 160 161 173
# Show the raw text next to the converted value for those rows.
simpsons[na_viewer_rows, c("us_viewers_in_millions", "viewers")]
## # A tibble: 3 × 2
## us_viewers_in_millions viewers
## <chr> <dbl>
## 1 N/A NA
## 2 N/A NA
## 3 N/A NA
# Keep only the rows that have a viewer count. A logical mask replaces the
# hard-coded -c(160, 161, 173): it follows the data if rows move, and unlike
# negative indexing with an empty index it keeps every row when nothing is NA.
simpsons_rm <- simpsons[!is.na(simpsons$viewers), ]
as.numeric() for further
application
is.na() didn’t detect because it was of class
char
(Match id to 607, 679, 709)
Note: Graphs are expected to fall under time series analysis, and
id/original_air_date/age are
somewhat interchangeable since they are effectively
factors/‘categorical.’
#rating_v_sason <- ggplot(data = tempSimps, aes(x = season, y = rating, color = season))
# Box plot of episode ratings per season. The y-axis title is set once, through
# the scale name; the former ylab("Rating") was overridden by that name and
# never displayed, so it has been dropped.
temp <- ggplot(data = tempSimps, aes(x = season, y = rating, color = season)) +
  geom_boxplot() +
  ggtitle("IMDB Rating vs Simpsons Seasons") +
  xlab("Season") +
  scale_y_continuous(name = "IMDB Rating", limits = c(4, 10))
#theme(legend.position = "none") +
ggplotly(temp)
# Mean rating per season, drawn as one point per season.
sznAvg <- tempSimps %>%
  group_by(season) %>%
  summarise(sznAvg = mean(rating))
temp2 <- ggplot(data = sznAvg, aes(x = season, y = sznAvg)) +
  geom_point() +
  ggtitle("Average IMDB Rating vs Simpsons Season") +
  xlab("Season") +
  ylab("Average IMDB Rating")
ggplotly(temp2)
# viewers ----
# Box plot of viewers per season. The assignment target is tempViewers, the
# name passed to ggplotly() below; the stray text "viewers" had been fused onto
# it (viewerstempViewers), which left tempViewers undefined.
tempViewers <- ggplot(
  data = tempSimps,
  aes(x = season, y = as.numeric(viewers), color = season)
) +
  geom_boxplot() +
  ggtitle("Viewers vs Simpsons Seasons") +
  xlab("Season") +
  ylab("Viewers (p/ mil)") +
  theme(legend.position = "none")
ggplotly(tempViewers)
# Same box plot with the individual rows overlaid as jittered points.
tempJitter <- ggplot(
  tempSimps,
  aes(x = season, y = as.numeric(viewers), color = season)
) +
  geom_boxplot() +
  geom_jitter(width = 0.25, alpha = 0.5)
ggplotly(tempJitter)
# Copy of tempSimps with the season column duplicated as Season, which the
# colour and shape aesthetics below map to.
tempSimps2 <- tempSimps
tempSimps2$Season <- tempSimps2$season
# Rating per episode id, as points joined by lines. rep(0:14, 3) supplies 45
# shape codes by cycling through shapes 0-14. The y-axis title comes from the
# scale name; the overridden ylab("Rating") has been removed.
tempPlot <- ggplot(
  data = tempSimps2,
  aes(x = id, y = rating, col = Season, shape = Season)
) +
  geom_point() +
  xlab("Episode Number") +
  ggtitle("Rating vs Episode") +
  geom_line() +
  scale_shape_manual(values = rep(0:14, 3)) +
  scale_y_continuous(name = "IMDB Rating", limits = c(4, 10))
ggplotly(tempPlot)
#ggplot(data = simpsons, aes(x=id, y = rating)) + geom_point()
#ggplot(data = simpsons, aes(x=original_air_date, y = rating)) + geom_point()
# Viewers per episode id. geom_line() was previously added twice, drawing the
# same line layer two times; a single layer is kept.
tempPlot <- ggplot(
  data = tempSimps2,
  aes(x = id, y = viewers, col = Season, shape = Season)
) +
  geom_point() +
  ylab("Viewer") +
  xlab("Episode Number") +
  ggtitle("Episode vs Viewer") +
  geom_line() +
  scale_shape_manual(values = rep(0:14, 3))
#theme(legend.position = "none")
ggplotly(tempPlot)
#ggplot(data = simpsons, aes(x=id, y = rating)) + geom_point()
#ggplot(data = simpsons, aes(x=original_air_date, y = rating)) + geom_point()
| | Set 1 (1 - 12) | Set 2 (13 - 23) | Set 3 (24 - 33) |
|---|---|---|---|
| Increase | 10 | N/A | 28 |
| Neutral | 1, 4, 6, 11, 12 | 16 | 27 |
| Decrease | 2, 3, 5, 7, 8, 9 | 13-15, 17-23 | 24-26, 29-33 |
Error in season 33?
# Season 33 check: plot viewers per episode id and mark the first and last
# rows of the season in red. The season number is named once so the title no
# longer depends on a variable `i` that is not set anywhere in this code.
season_num <- 33
s33 <- get_season(season_num)
#tempGG <- ggplot(temp, aes(x = id, y = viewers)) + geom_point() +
# ggtitle(glue('Season {i}')) + xlab("Episode") + ylab("Viewers")
# First and last rows of s33, in the order get_season() returned them.
highlight_g <- as.data.frame(s33[c(1, nrow(s33)), ])
tempGG <- ggplot(s33, aes(x = id, y = viewers)) +
  geom_point() +
  ggtitle(glue("Season {season_num}")) +
  xlab("Episode") +
  ylab("Viewers") +
  geom_point(
    data = highlight_g,
    aes(x = id, y = viewers),
    col = "red",
    size = 3
  )
ggplotly(tempGG)
# Re-sort by id so the highlighted first/last rows are the lowest and highest
# id. The assignment previously read "ids33_fixed <-", which left s33_fixed
# (used on the following lines) undefined.
s33_fixed <- arrange(s33, id)
highlight_g <- as.data.frame(s33_fixed[c(1, nrow(s33_fixed)), ])
tempGG <- ggplot(s33_fixed, aes(x = id, y = viewers)) +
  geom_point() +
  ggtitle(glue("Season {season_num}")) +
  xlab("Episode") +
  ylab("Viewers") +
  geom_point(
    data = highlight_g,
    aes(x = id, y = viewers),
    col = "red",
    size = 3
  )
ggplotly(tempGG)
# "Old" viewers plot: us_viewers_in_millions from old_simps is plotted as
# stored, on a discrete y scale, with points coloured by season and the legend
# hidden.
oldViewers <- ggplot(
  data = old_simps,
  mapping = aes(x = id, y = us_viewers_in_millions, col = as.factor(season))
) +
  geom_point() +
  labs(
    title = "Num Viewers vs Episode (Old)",
    x = "Episode Number",
    y = "Viewers"
  ) +
  theme(legend.position = "none") +
  scale_y_discrete(breaks = seq(0, 12, 2))
ggplotly(oldViewers)
us_viewers_in_millions values hard-coded as
N/A (char class)
#simpsons$viewers <- as.numeric(simpsons$viewers)
#(refactoredViewer <- ggplot(data = simpsons, aes(x = id, y = viewers, col = as.factor(season))) +
# geom_point() + ylab("Viewers") + xlab("Episode Number") + ggtitle("Num Viewers vs Episode") +
# scale_y_continuous(name="Viewers (in mil)", limits=c(0,30)))
#ggplot(data = tempSimps, aes(x = id, y = viewers, col = as.factor(season))) + geom_point()
Upon refactoring viewer count to numeric, the trend does indeed follow expectations (as noted from prior graphs)
THOH 31 is lowest, and lower even within the immediate range / season
#length(unique(simpsons$writers))
#length(unique(simpsons$directors))
#mean(nchar(simpsons$writers))
# Episodes whose writers string is longer than the mean length. The mean is
# computed from the data instead of being pasted in as the constant 16.6069,
# so the threshold stays correct if simpsons changes.
writer_nchar <- nchar(simpsons$writers)
mean_writer_nchar <- mean(writer_nchar, na.rm = TRUE)
avgWriter <- simpsons[which(writer_nchar > mean_writer_nchar), ]
#ggplot(simpsons, aes(writers, fill = writers)) + geom_bar() +
#coord_polar("y", start = 0)
# One row per episode of tempSimps2: the writers string and its character count.
writers <- data.frame(
  writers = tempSimps2$writers,
  length = nchar(tempSimps2$writers)
)
# Split at 20 characters. An earlier mean-based long_writer assignment was
# overwritten by this one before being used, so it has been removed.
long_writer <- writers %>% filter(length >= 20)
short_writer <- writers %>% filter(length <= 19)
# Remove every "Story by:" occurrence from the long entries.
long_writer$writers <- str_remove_all(long_writer$writers, "Story by:")
#long_writer <- str_remove_all(long_writer$writers, "Story by:")
#———————-